#include<cuda_runtime.h>
#include "utility.h"
#include "device_launch_parameters.h"

using namespace std;

// File-scope texture references.
// The four geno_* textures are bound by cuda_GetInteractionPairs to the device
// genotype buffers of tables 0..3 (gpu_genoY_G[0..3]); the uint2 element type
// matches the 8-byte width of one packed genotype word.
// NOTE(review): the kernel visible in this file reads the genotype words through
// the genoY_G pointer array rather than fetching through these textures - confirm
// whether any other code still samples them before removing.
texture<uint2, 1, cudaReadModeElementType> geno_00_Texture;
texture<uint2, 1, cudaReadModeElementType> geno_01_Texture;
texture<uint2, 1, cudaReadModeElementType> geno_10_Texture;
texture<uint2, 1, cudaReadModeElementType> geno_11_Texture;
// Bound by cuda_GetInteractionPairs to the device copy of the `wordbits` byte table.
// Presumably a per-word bit-count lookup table - confirm against the caller.
texture<unsigned char, 1, cudaReadModeElementType> wordbits_Texture;


// Integer division rounded up: returns a / b, plus one whenever the division
// leaves a non-zero remainder. b must be non-zero.
long long iDivUp(long long a, long long b) {
	const long long quotient = a / b;
	if (a % b == 0) {
		return quotient;
	}
	return quotient + 1;
}

// Reads (and thereby clears) the last CUDA runtime error. When an error is
// pending, prints it to stderr prefixed with `msg` and terminates the process
// with EXIT_FAILURE; otherwise returns normally.
void checkCUDAError(const char *msg) {
	const cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess) {
		return;
	}
	fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
	exit(EXIT_FAILURE);
}

// Returns the number of set bits (0..64) in the 64-bit pattern of `i`.
//
// Uses the __popcll device intrinsic on the unsigned reinterpretation of the
// argument. The previous hand-rolled version did its arithmetic on a signed
// value and relied on a multiplication that overflows the signed range, which
// is undefined behaviour; the intrinsic yields the same counts without it.
inline __device__ int dev_count_bit(__int64 i) {
	return __popcll((unsigned long long)i);
}

// Counts the set bits of `x` with shifts, masks and additions only (no
// multiplication): partial counts are widened from 2-bit fields up to the
// whole word, and the total is read from the low 7 bits.
inline __device__ int dev_count_bit_slow_mult(__int64 x) {
	const __int64 pairMask = 0x5555555555555555;
	const __int64 nibbleMask = 0x3333333333333333;
	const __int64 byteMask = 0x0f0f0f0f0f0f0f0f;

	// each 2-bit field holds the number of set bits it originally contained
	__int64 perPair = x - ((x >> 1) & pairMask);
	// each 4-bit field holds the count for its four original bits
	__int64 perNibble = (perPair & nibbleMask) + ((perPair >> 2) & nibbleMask);
	// each byte holds the count for its eight original bits
	__int64 folded = (perNibble + (perNibble >> 4)) & byteMask;

	// fold neighbouring fields together: 16-bit, 32-bit, then 64-bit totals
	// accumulate in the lowest byte
	folded = folded + (folded >> 8);
	folded = folded + (folded >> 16);
	folded = folded + (folded >> 32);

	return folded & 0x7f;
}

//test_CPU debug
// Slot of cell (i, j, k, t) inside margin table number `margin` for the
// 576-cell model, where t = t1*16 + t2*8 + t3*4 + t4*2 + t5.
// Margins: 0 = ijk, 1 = ij+t, 2 = k+t, 3 = ik, 4 = jk.
static inline __device__ int pc5MarginSlot(int margin, int i, int j, int k, int t)
{
	switch (margin)
	{
	case 0: return i * 6 + j * 2 + k;
	case 1: return i * 96 + j * 32 + t;
	case 2: return k * 32 + t;
	case 3: return i * 2 + k;
	case 4: return j * 2 + k;
	default: return 0;
	}
}

// Fits expected cell values to a 576-cell table (3 x 3 x 2 x 2^5) by repeated
// proportional rescaling and returns sum(observed * log(fitted)) over the cells
// whose fitted value is positive.
//
// localGenoDistr : 576 observed counts, addressed as
//                  k*288 + t1*144 + t2*72 + t3*36 + t4*18 + t5*9 + i*3 + j.
//                  (presumably i/j are the two SNP genotypes, k the phenotype
//                  class and t1..t5 binary covariates - confirm with the caller)
// flag           : true  -> margins ijk, ij+t, k+t are matched in turn;
//                  false -> margins ij+t, k+t, ik, jk are matched in turn.
//
// Every cell starts at 10. A sweep rescales the fitted table so that each
// selected margin equals the observed margin; sweeps repeat until the summed
// absolute change of the fitted cells over one sweep is no longer above 1.
__device__ float postCorrection_5(int* localGenoDistr, bool flag)
{
	float fitted[576];
	float previous[576] = { 0 };

	// margins of the fitted table
	float fit_ijk[18] = { 0 };
	float fit_ijt[288] = { 0 };
	float fit_kt[64] = { 0 };
	float fit_ik[6] = { 0 };
	float fit_jk[6] = { 0 };
	float* fitMargin[5] = { fit_ijk, fit_ijt, fit_kt, fit_ik, fit_jk };

	// margins of the observed table
	float obs_ijk[18] = { 0 };
	float obs_ijt[288] = { 0 };
	float obs_kt[64] = { 0 };
	float obs_ik[6] = { 0 };
	float obs_jk[6] = { 0 };
	float* obsMargin[5] = { obs_ijk, obs_ijt, obs_kt, obs_ik, obs_jk };

	int marginLen[5] = { 18, 288, 64, 6, 6 };

	double drift = 0.0;
	double logLik = 0.0;

	for (int c = 0; c < 576; c++)
	{
		fitted[c] = 10;
		drift += abs(fitted[c] - previous[c]);
	}

	// Observed margins. The counter c walks the cells in the order
	// i (slowest), j, k, t1..t5 (fastest).
	for (int c = 0; c < 576; c++)
	{
		const int i = c / 192;
		const int j = (c / 64) % 3;
		const int k = (c / 32) % 2;
		const int t = c % 32;
		const int cell = k * 288 + t * 9 + i * 3 + j;

		for (int m = 0; m < 5; m++)
		{
			obsMargin[m][pc5MarginSlot(m, i, j, k, t)] += localGenoDistr[cell];
		}
	}

	// The set of margins to match depends only on flag.
	const int firstMargin = flag ? 0 : 1;
	const int endMargin = flag ? 3 : 5;

	while (drift > 1)
	{
		for (int c = 0; c < 576; c++)
		{
			previous[c] = fitted[c];
		}

		for (int m = firstMargin; m < endMargin; m++)
		{
			float* fit = fitMargin[m];
			float* obs = obsMargin[m];

			for (int s = 0; s < marginLen[m]; s++)
			{
				fit[s] = 0;
			}

			// current margin of the fitted table
			for (int c = 0; c < 576; c++)
			{
				const int i = c / 192;
				const int j = (c / 64) % 3;
				const int k = (c / 32) % 2;
				const int t = c % 32;
				const int cell = k * 288 + t * 9 + i * 3 + j;

				fit[pc5MarginSlot(m, i, j, k, t)] += fitted[cell];
			}

			// rescale every cell by observed margin / fitted margin
			for (int c = 0; c < 576; c++)
			{
				const int i = c / 192;
				const int j = (c / 64) % 3;
				const int k = (c / 32) % 2;
				const int t = c % 32;
				const int cell = k * 288 + t * 9 + i * 3 + j;
				const int slot = pc5MarginSlot(m, i, j, k, t);

				if (fit[slot]>0)
					fitted[cell] *= (obs[slot] / fit[slot]);
				else
					fitted[cell] = 0;
			}
		}

		drift = 0;
		for (int c = 0; c < 576; c++)
		{
			drift += abs(fitted[c] - previous[c]);
		}
	}

	for (int c = 0; c < 576; c++)
	{
		const int i = c / 192;
		const int j = (c / 64) % 3;
		const int k = (c / 32) % 2;
		const int t = c % 32;
		const int cell = k * 288 + t * 9 + i * 3 + j;

		const double observed = localGenoDistr[cell];
		if (fitted[cell]>0)
			logLik += observed*log(fitted[cell]);
	}
	return logLik;
}

// Slot of cell (i, j, k, t) inside margin table number `margin` for the
// 36-cell model. Margins: 0 = ik, 1 = jk, 2 = kt, 3 = ijt, 4 = ijk.
static inline __device__ int pc1MarginSlot(int margin, int i, int j, int k, int t)
{
	switch (margin)
	{
	case 0: return i * 2 + k;
	case 1: return j * 2 + k;
	case 2: return k * 2 + t;
	case 3: return i * 6 + j * 2 + t;
	case 4: return i * 6 + j * 2 + k;
	default: return 0;
	}
}

// Fits expected cell values to a 36-cell table (3 x 3 x 2 x 2) by repeated
// proportional rescaling and returns sum(observed * log(fitted)) over the cells
// whose fitted value is positive.
//
// localGenoDistr : 36 observed counts, addressed as k*18 + t*9 + i*3 + j.
//                  (presumably i/j are the two SNP genotypes, k the phenotype
//                  class and t one binary covariate - confirm with the caller)
// flag           : true  -> margins kt, ijt, ijk are matched in turn;
//                  false -> margins ik, jk, kt, ijt are matched in turn.
//
// The fitted table is stored as i*12 + j*4 + k*2 + t (a different layout from
// the observed table) and every cell starts at 1. Sweeps repeat until the
// summed absolute change of the fitted cells over one sweep is no longer above 1.
__device__ float postCorrection_1(int* localGenoDistr, bool flag)
{
	float fitted[36];
	float previous[36] = { 0 };

	// margins of the fitted table, in the order they are processed
	float fit_ik[6] = { 0 };
	float fit_jk[6] = { 0 };
	float fit_kt[4] = { 0 };
	float fit_ijt[18] = { 0 };
	float fit_ijk[18] = { 0 };
	float* fitMargin[5] = { fit_ik, fit_jk, fit_kt, fit_ijt, fit_ijk };

	// margins of the observed table
	float obs_ik[6] = { 0 };
	float obs_jk[6] = { 0 };
	float obs_kt[4] = { 0 };
	float obs_ijt[18] = { 0 };
	float obs_ijk[18] = { 0 };
	float* obsMargin[5] = { obs_ik, obs_jk, obs_kt, obs_ijt, obs_ijk };

	int marginLen[5] = { 6, 6, 4, 18, 18 };

	float drift = 36.0;
	float logLik = 0.0;

	for (int c = 0; c < 36; c++)
	{
		fitted[c] = 1;
	}

	// Observed margins. c equals the fitted-table index i*12 + j*4 + k*2 + t.
	for (int c = 0; c < 36; c++)
	{
		const int i = c / 12;
		const int j = (c / 4) % 3;
		const int k = (c / 2) % 2;
		const int t = c % 2;
		const int obsCell = k * 18 + t * 9 + i * 3 + j;

		for (int m = 0; m < 5; m++)
		{
			obsMargin[m][pc1MarginSlot(m, i, j, k, t)] += localGenoDistr[obsCell];
		}
	}

	// flag == false: ik, jk, kt, ijt   (margins 0..3)
	// flag == true : kt, ijt, ijk      (margins 2..4)
	const int firstMargin = flag ? 2 : 0;
	const int endMargin = flag ? 5 : 4;

	while (drift>1)
	{
		for (int c = 0; c < 36; c++)
		{
			previous[c] = fitted[c];
		}

		for (int m = firstMargin; m < endMargin; m++)
		{
			float* fit = fitMargin[m];
			float* obs = obsMargin[m];

			for (int s = 0; s < marginLen[m]; s++)
			{
				fit[s] = 0;
			}

			// current margin of the fitted table
			for (int c = 0; c < 36; c++)
			{
				const int i = c / 12;
				const int j = (c / 4) % 3;
				const int k = (c / 2) % 2;
				const int t = c % 2;

				fit[pc1MarginSlot(m, i, j, k, t)] += fitted[c];
			}

			// rescale: fitted = fitted * observed margin / fitted margin
			for (int c = 0; c < 36; c++)
			{
				const int i = c / 12;
				const int j = (c / 4) % 3;
				const int k = (c / 2) % 2;
				const int t = c % 2;
				const int slot = pc1MarginSlot(m, i, j, k, t);

				if (fit[slot] > 0)
					fitted[c] = fitted[c] * obs[slot] / fit[slot];
				else
					fitted[c] = 0;
			}
		}

		drift = 0.0;
		for (int c = 0; c < 36; c++)
		{
			drift += abs(fitted[c] - previous[c]);
		}
	}

	for (int c = 0; c < 36; c++)
	{
		const int i = c / 12;
		const int j = (c / 4) % 3;
		const int k = (c / 2) % 2;
		const int t = c % 2;

		const float observed = localGenoDistr[k * 18 + t * 9 + i * 3 + j];
		if (fitted[c]>0)
		{
			logLik += observed*log(fitted[c]);
		}
	}

	return logLik;
}

// Slot of cell (i, j, k, t1, t2) inside margin table number `margin` for the
// 72-cell model. Margins: 0 = ijk, 1 = ijt1, 2 = ijt2, 3 = t1t2, 4 = kt1,
// 5 = kt2, 6 = ij, 7 = ik, 8 = jk.
static inline __device__ int pc2MarginSlot(int margin, int i, int j, int k, int t1, int t2)
{
	switch (margin)
	{
	case 0: return i * 6 + j * 2 + k;
	case 1: return i * 6 + j * 2 + t1;
	case 2: return i * 6 + j * 2 + t2;
	case 3: return t1 * 2 + t2;
	case 4: return k * 2 + t1;
	case 5: return k * 2 + t2;
	case 6: return i * 3 + j;
	case 7: return i * 2 + k;
	case 8: return j * 2 + k;
	default: return 0;
	}
}

// Fits expected cell values to a 72-cell table (3 x 3 x 2 x 2 x 2) by repeated
// proportional rescaling and returns sum(observed * log(fitted)) over the cells
// whose fitted value is positive.
//
// localGenoDistr : 72 observed counts, addressed as
//                  k*36 + t1*18 + t2*9 + i*3 + j.
//                  (presumably i/j are the two SNP genotypes, k the phenotype
//                  class and t1/t2 binary covariates - confirm with the caller)
// flag           : true  -> margins 0..5 (ijk, ijt1, ijt2, t1t2, kt1, kt2);
//                  false -> margins 1..8 (ijt1, ijt2, t1t2, kt1, kt2, ij, ik, jk).
//
// Every cell starts at 100. Sweeps repeat until the summed absolute change of
// the fitted cells over one sweep is no longer above 1.
__device__ float postCorrection_2(int* localGenoDistr, bool flag)
{
	float fitted[72];
	float previous[72] = { 0 };

	// margins of the fitted table
	float fit_ijk[18] = { 0 };
	float fit_ijt1[18] = { 0 };
	float fit_ijt2[18] = { 0 };
	float fit_t1t2[4] = { 0 };
	float fit_kt1[4] = { 0 };
	float fit_kt2[4] = { 0 };
	float fit_ij[9] = { 0 };
	float fit_ik[6] = { 0 };
	float fit_jk[6] = { 0 };
	float* fitMargin[9] = { fit_ijk, fit_ijt1, fit_ijt2, fit_t1t2, fit_kt1, fit_kt2, fit_ij, fit_ik, fit_jk };

	// margins of the observed table
	float obs_ijk[18] = { 0 };
	float obs_ijt1[18] = { 0 };
	float obs_ijt2[18] = { 0 };
	float obs_t1t2[4] = { 0 };
	float obs_kt1[4] = { 0 };
	float obs_kt2[4] = { 0 };
	float obs_ij[9] = { 0 };
	float obs_ik[6] = { 0 };
	float obs_jk[6] = { 0 };
	float* obsMargin[9] = { obs_ijk, obs_ijt1, obs_ijt2, obs_t1t2, obs_kt1, obs_kt2, obs_ij, obs_ik, obs_jk };

	int marginLen[9] = { 18, 18, 18, 4, 4, 4, 9, 6, 6 };

	double drift = 0.0;
	double logLik = 0.0;

	for (int c = 0; c < 72; c++)
	{
		fitted[c] = 100;
		drift += abs(fitted[c] - previous[c]);
	}

	// Observed margins. The counter c walks the cells in the order
	// i (slowest), j, k, t1, t2 (fastest).
	for (int c = 0; c < 72; c++)
	{
		const int i = c / 24;
		const int j = (c / 8) % 3;
		const int k = (c / 4) % 2;
		const int t1 = (c / 2) % 2;
		const int t2 = c % 2;
		const int cell = k * 36 + t1 * 18 + t2 * 9 + i * 3 + j;

		for (int m = 0; m < 9; m++)
		{
			obsMargin[m][pc2MarginSlot(m, i, j, k, t1, t2)] += localGenoDistr[cell];
		}
	}

	// The set of margins to match depends only on flag.
	const int firstMargin = flag ? 0 : 1;
	const int endMargin = flag ? 6 : 9;

	while (drift > 1)
	{
		for (int c = 0; c < 72; c++)
		{
			previous[c] = fitted[c];
		}

		for (int m = firstMargin; m < endMargin; m++)
		{
			float* fit = fitMargin[m];
			float* obs = obsMargin[m];

			for (int s = 0; s < marginLen[m]; s++)
			{
				fit[s] = 0;
			}

			// current margin of the fitted table
			for (int c = 0; c < 72; c++)
			{
				const int i = c / 24;
				const int j = (c / 8) % 3;
				const int k = (c / 4) % 2;
				const int t1 = (c / 2) % 2;
				const int t2 = c % 2;
				const int cell = k * 36 + t1 * 18 + t2 * 9 + i * 3 + j;

				fit[pc2MarginSlot(m, i, j, k, t1, t2)] += fitted[cell];
			}

			// rescale every cell by observed margin / fitted margin
			for (int c = 0; c < 72; c++)
			{
				const int i = c / 24;
				const int j = (c / 8) % 3;
				const int k = (c / 4) % 2;
				const int t1 = (c / 2) % 2;
				const int t2 = c % 2;
				const int cell = k * 36 + t1 * 18 + t2 * 9 + i * 3 + j;
				const int slot = pc2MarginSlot(m, i, j, k, t1, t2);

				if (fit[slot]>0)
					fitted[cell] *= (obs[slot] / fit[slot]);
				else
					fitted[cell] = 0;
			}
		}

		drift = 0;
		for (int c = 0; c < 72; c++)
		{
			drift += abs(fitted[c] - previous[c]);
		}
	}

	for (int c = 0; c < 72; c++)
	{
		const int i = c / 24;
		const int j = (c / 8) % 3;
		const int k = (c / 4) % 2;
		const int t1 = (c / 2) % 2;
		const int t2 = c % 2;
		const int cell = k * 36 + t1 * 18 + t2 * 9 + i * 3 + j;

		const double observed = localGenoDistr[cell];
		if (fitted[cell]>0)
			logLik += observed*log(fitted[cell]);
	}
	return logLik;
}

// Slot of cell (i, j, k, t) inside margin table number `margin` for the
// 144-cell model, where t = t1*4 + t2*2 + t3.
// Margins: 0 = ijk, 1 = ij+t, 2 = k+t, 3 = ik, 4 = jk.
static inline __device__ int pc3MarginSlot(int margin, int i, int j, int k, int t)
{
	switch (margin)
	{
	case 0: return i * 6 + j * 2 + k;
	case 1: return i * 24 + j * 8 + t;
	case 2: return k * 8 + t;
	case 3: return i * 2 + k;
	case 4: return j * 2 + k;
	default: return 0;
	}
}

// Fits expected cell values to a 144-cell table (3 x 3 x 2 x 2^3) by repeated
// proportional rescaling and returns sum(observed * log(fitted)) over the cells
// whose fitted value is positive.
//
// localGenoDistr : 144 observed counts, addressed as
//                  k*72 + t1*36 + t2*18 + t3*9 + i*3 + j.
//                  (presumably i/j are the two SNP genotypes, k the phenotype
//                  class and t1..t3 binary covariates - confirm with the caller)
// flag           : true  -> margins ijk, ij+t, k+t are matched in turn;
//                  false -> margins ij+t, k+t, ik, jk are matched in turn.
//
// Every cell starts at 100. Sweeps repeat until the summed absolute change of
// the fitted cells over one sweep is no longer above 1.
__device__ float postCorrection_3(int* localGenoDistr, bool flag)
{
	float fitted[144];
	float previous[144] = { 0 };

	// margins of the fitted table
	float fit_ijk[18] = { 0 };
	float fit_ijt[72] = { 0 };
	float fit_kt[16] = { 0 };
	float fit_ik[6] = { 0 };
	float fit_jk[6] = { 0 };
	float* fitMargin[5] = { fit_ijk, fit_ijt, fit_kt, fit_ik, fit_jk };

	// margins of the observed table
	float obs_ijk[18] = { 0 };
	float obs_ijt[72] = { 0 };
	float obs_kt[16] = { 0 };
	float obs_ik[6] = { 0 };
	float obs_jk[6] = { 0 };
	float* obsMargin[5] = { obs_ijk, obs_ijt, obs_kt, obs_ik, obs_jk };

	int marginLen[5] = { 18, 72, 16, 6, 6 };

	double drift = 0.0;
	double logLik = 0.0;

	for (int c = 0; c < 144; c++)
	{
		fitted[c] = 100;
		drift += abs(fitted[c] - previous[c]);
	}

	// Observed margins. The counter c walks the cells in the order
	// i (slowest), j, k, t1..t3 (fastest).
	for (int c = 0; c < 144; c++)
	{
		const int i = c / 48;
		const int j = (c / 16) % 3;
		const int k = (c / 8) % 2;
		const int t = c % 8;
		const int cell = k * 72 + t * 9 + i * 3 + j;

		for (int m = 0; m < 5; m++)
		{
			obsMargin[m][pc3MarginSlot(m, i, j, k, t)] += localGenoDistr[cell];
		}
	}

	// The set of margins to match depends only on flag.
	const int firstMargin = flag ? 0 : 1;
	const int endMargin = flag ? 3 : 5;

	while (drift > 1)
	{
		for (int c = 0; c < 144; c++)
		{
			previous[c] = fitted[c];
		}

		for (int m = firstMargin; m < endMargin; m++)
		{
			float* fit = fitMargin[m];
			float* obs = obsMargin[m];

			for (int s = 0; s < marginLen[m]; s++)
			{
				fit[s] = 0;
			}

			// current margin of the fitted table
			for (int c = 0; c < 144; c++)
			{
				const int i = c / 48;
				const int j = (c / 16) % 3;
				const int k = (c / 8) % 2;
				const int t = c % 8;
				const int cell = k * 72 + t * 9 + i * 3 + j;

				fit[pc3MarginSlot(m, i, j, k, t)] += fitted[cell];
			}

			// rescale every cell by observed margin / fitted margin
			for (int c = 0; c < 144; c++)
			{
				const int i = c / 48;
				const int j = (c / 16) % 3;
				const int k = (c / 8) % 2;
				const int t = c % 8;
				const int cell = k * 72 + t * 9 + i * 3 + j;
				const int slot = pc3MarginSlot(m, i, j, k, t);

				if (fit[slot]>0)
					fitted[cell] *= (obs[slot] / fit[slot]);
				else
					fitted[cell] = 0;
			}
		}

		drift = 0;
		for (int c = 0; c < 144; c++)
		{
			drift += abs(fitted[c] - previous[c]);
		}
	}

	for (int c = 0; c < 144; c++)
	{
		const int i = c / 48;
		const int j = (c / 16) % 3;
		const int k = (c / 8) % 2;
		const int t = c % 8;
		const int cell = k * 72 + t * 9 + i * 3 + j;

		const double observed = localGenoDistr[cell];
		if (fitted[cell]>0)
			logLik += observed*log(fitted[cell]);
	}
	return logLik;
}

// Slot of cell (i, j, k, t) inside margin table number `margin` for the
// 288-cell model, where t = t1*8 + t2*4 + t3*2 + t4.
// Margins: 0 = ijk, 1 = ij+t, 2 = k+t, 3 = ik, 4 = jk.
static inline __device__ int pc4MarginSlot(int margin, int i, int j, int k, int t)
{
	switch (margin)
	{
	case 0: return i * 6 + j * 2 + k;
	case 1: return i * 48 + j * 16 + t;
	case 2: return k * 16 + t;
	case 3: return i * 2 + k;
	case 4: return j * 2 + k;
	default: return 0;
	}
}

// Fits expected cell values to a 288-cell table (3 x 3 x 2 x 2^4) by repeated
// proportional rescaling and returns sum(observed * log(fitted)) over the cells
// whose fitted value is positive.
//
// localGenoDistr : 288 observed counts, addressed as
//                  k*144 + t1*72 + t2*36 + t3*18 + t4*9 + i*3 + j.
//                  (presumably i/j are the two SNP genotypes, k the phenotype
//                  class and t1..t4 binary covariates - confirm with the caller)
// flag           : true  -> margins ijk, ij+t, k+t are matched in turn;
//                  false -> margins ij+t, k+t, ik, jk are matched in turn.
//
// Every cell starts at 100. Sweeps repeat until the summed absolute change of
// the fitted cells over one sweep is no longer above 1.
__device__ float postCorrection_4(int* localGenoDistr, bool flag)
{
	float fitted[288];
	float previous[288] = { 0 };

	// margins of the fitted table
	float fit_ijk[18] = { 0 };
	float fit_ijt[144] = { 0 };
	float fit_kt[32] = { 0 };
	float fit_ik[6] = { 0 };
	float fit_jk[6] = { 0 };
	float* fitMargin[5] = { fit_ijk, fit_ijt, fit_kt, fit_ik, fit_jk };

	// margins of the observed table
	float obs_ijk[18] = { 0 };
	float obs_ijt[144] = { 0 };
	float obs_kt[32] = { 0 };
	float obs_ik[6] = { 0 };
	float obs_jk[6] = { 0 };
	float* obsMargin[5] = { obs_ijk, obs_ijt, obs_kt, obs_ik, obs_jk };

	int marginLen[5] = { 18, 144, 32, 6, 6 };

	double drift = 0.0;
	double logLik = 0.0;

	for (int c = 0; c < 288; c++)
	{
		fitted[c] = 100;
		drift += abs(fitted[c] - previous[c]);
	}

	// Observed margins. The counter c walks the cells in the order
	// i (slowest), j, k, t1..t4 (fastest).
	for (int c = 0; c < 288; c++)
	{
		const int i = c / 96;
		const int j = (c / 32) % 3;
		const int k = (c / 16) % 2;
		const int t = c % 16;
		const int cell = k * 144 + t * 9 + i * 3 + j;

		for (int m = 0; m < 5; m++)
		{
			obsMargin[m][pc4MarginSlot(m, i, j, k, t)] += localGenoDistr[cell];
		}
	}

	// The set of margins to match depends only on flag.
	const int firstMargin = flag ? 0 : 1;
	const int endMargin = flag ? 3 : 5;

	while (drift > 1)
	{
		for (int c = 0; c < 288; c++)
		{
			previous[c] = fitted[c];
		}

		for (int m = firstMargin; m < endMargin; m++)
		{
			float* fit = fitMargin[m];
			float* obs = obsMargin[m];

			for (int s = 0; s < marginLen[m]; s++)
			{
				fit[s] = 0;
			}

			// current margin of the fitted table
			for (int c = 0; c < 288; c++)
			{
				const int i = c / 96;
				const int j = (c / 32) % 3;
				const int k = (c / 16) % 2;
				const int t = c % 16;
				const int cell = k * 144 + t * 9 + i * 3 + j;

				fit[pc4MarginSlot(m, i, j, k, t)] += fitted[cell];
			}

			// rescale every cell by observed margin / fitted margin
			for (int c = 0; c < 288; c++)
			{
				const int i = c / 96;
				const int j = (c / 32) % 3;
				const int k = (c / 16) % 2;
				const int t = c % 16;
				const int cell = k * 144 + t * 9 + i * 3 + j;
				const int slot = pc4MarginSlot(m, i, j, k, t);

				if (fit[slot]>0)
					fitted[cell] *= (obs[slot] / fit[slot]);
				else
					fitted[cell] = 0;
			}
		}

		drift = 0;
		for (int c = 0; c < 288; c++)
		{
			drift += abs(fitted[c] - previous[c]);
		}
	}

	for (int c = 0; c < 288; c++)
	{
		const int i = c / 96;
		const int j = (c / 32) % 3;
		const int k = (c / 16) % 2;
		const int t = c % 16;
		const int cell = k * 144 + t * 9 + i * 3 + j;

		const double observed = localGenoDistr[cell];
		if (fitted[cell]>0)
			logLik += observed*log(fitted[cell]);
	}
	return logLik;
}


// Scores one candidate SNP pair per thread.
//
// Launch layout: 1-D grid of 1-D blocks; thread g = blockIdx.x * blockDim.x +
// threadIdx.x handles the pair (interactionInputOffsetJ1[g],
// interactionInputOffsetJ2[g]). There is no bound on g, so every per-pair array
// must hold at least gridDim.x * blockDim.x entries. No shared memory is used.
//
// genoY_G                  : device array of 2^(numofCov+1) device pointers, one
//                            packed genotype bit table per stratum; word `index`
//                            of genotype g for SNP s is at [(index*3 + g)*nsnps + s].
// nsnps                    : number of SNPs.
// nlongintY_G              : number of 64-bit words per (genotype, SNP) in each table.
// interactionInputOffsetJ1/2 : first / second SNP index of each candidate pair.
// pMarginalDistrSNP        : unused by this kernel.
// pMarginalDistrSNP_Y      : per-table genotype counts, addressed as
//                            [(genotype*numofTables + table)*nsnps + snp].
// interactionPairOffsetJ1/2 : output; the pair's SNP indices when its measure
//                            exceeds thresholdRecord, otherwise 0 / 0.
// interactionPairMeasure   : output; difference between the flag=1 and flag=0
//                            results of the matching postCorrection_N routine.
// numofCov                 : selects postCorrection_1..5; must be in 1..5.
//
// Threads whose pair is not valid (snp1 < 0, snp2 <= snp1, snp1 >= nsnps - 1 or
// snp2 >= nsnps), or that are launched with an unsupported numofCov, write
// 0 / 0 / 0 to their output slots so the host never reads stale device memory.
__global__ void pairwiseTest_Kernel(uint64** genoY_G, int nsnps, int* nlongintY_G, int* interactionInputOffsetJ1, int* interactionInputOffsetJ2, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y,
	int *interactionPairOffsetJ1, int *interactionPairOffsetJ2, float* interactionPairMeasure, double thresholdRecord, int numofCov){
	const int outIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int snp1 = interactionInputOffsetJ1[outIndex];
	const int snp2 = interactionInputOffsetJ2[outIndex];

	// localGenoDistr below holds at most 64 tables (numofCov == 5), and every
	// postCorrection_N routine reads its full table set, so anything outside
	// 1..5 would either overflow the buffer or read cells that were never filled.
	const bool covSupported = (numofCov >= 1) && (numofCov <= 5);
	const bool pairValid = (snp1 >= 0) && (snp2 > snp1) && (snp1 < nsnps - 1) && (snp2 < nsnps);
	if (!covSupported || !pairValid)
	{
		interactionPairOffsetJ1[outIndex] = 0;
		interactionPairOffsetJ2[outIndex] = 0;
		interactionPairMeasure[outIndex] = 0;
		return;
	}

	// one 3 x 3 genotype table per stratum: 2^(numofCov + 1) tables
	const int numofTables = 1 << (numofCov + 1);
	int localGenoDistr[64 * NumOfCell];

	// Count the four cells with genotypes 0/1 on both SNPs directly from the
	// packed bit tables.
	for (int m = 0; m < numofTables; m++)
	{
		const uint64* table = genoY_G[m];
		const int wordCount = nlongintY_G[m];
		for (int i = 0; i < 2; i++)
		{
			for (int j = 0; j < 2; j++)
			{
				int count = 0;
				for (int index = 0; index < wordCount; index++)
				{
					// size_t arithmetic: the word offset can exceed the int range on large inputs
					const size_t word1 = (size_t)(index * 3 + i) * nsnps + snp1;
					const size_t word2 = (size_t)(index * 3 + j) * nsnps + snp2;
					const __int64 andResult = table[word1] & table[word2];
					count += dev_count_bit(andResult);
				}
				localGenoDistr[m*NumOfCell + i * 3 + j] = count;
			}
		}
	}

	// Derive the remaining five cells from the per-SNP genotype counts.
	for (int m = 0; m < numofTables; m++)
	{
		int* cells = localGenoDistr + m*NumOfCell;
		cells[2] = pMarginalDistrSNP_Y[(0 * numofTables + m)*nsnps + snp1] - cells[0] - cells[1];
		cells[5] = pMarginalDistrSNP_Y[(1 * numofTables + m)*nsnps + snp1] - cells[3] - cells[4];
		cells[6] = pMarginalDistrSNP_Y[(0 * numofTables + m)*nsnps + snp2] - cells[0] - cells[3];
		cells[7] = pMarginalDistrSNP_Y[(1 * numofTables + m)*nsnps + snp2] - cells[1] - cells[4];
		cells[8] = pMarginalDistrSNP_Y[(2 * numofTables + m)*nsnps + snp2] - cells[2] - cells[5];
	}

	float interactionMeasure = 0;
	switch (numofCov)
	{
	case 1:
		interactionMeasure = postCorrection_1(localGenoDistr, 1) - postCorrection_1(localGenoDistr, 0);
		break;
	case 2:
		interactionMeasure = postCorrection_2(localGenoDistr, 1) - postCorrection_2(localGenoDistr, 0);
		break;
	case 3:
		interactionMeasure = postCorrection_3(localGenoDistr, 1) - postCorrection_3(localGenoDistr, 0);
		break;
	case 4:
		interactionMeasure = postCorrection_4(localGenoDistr, 1) - postCorrection_4(localGenoDistr, 0);
		break;
	default: // numofCov == 5, guaranteed by the range check above
		interactionMeasure = postCorrection_5(localGenoDistr, 1) - postCorrection_5(localGenoDistr, 0);
		break;
	}

	if (interactionMeasure>thresholdRecord)
	{
		interactionPairOffsetJ1[outIndex] = snp1;
		interactionPairOffsetJ2[outIndex] = snp2;
	}
	else
	{
		interactionPairOffsetJ1[outIndex] = 0;
		interactionPairOffsetJ2[outIndex] = 0;
	}
	interactionPairMeasure[outIndex] = interactionMeasure;
}




extern "C" void cuda_GetInteractionPairs(uint64** genoY_G, int nsnps, int nsamples, int numOfCov, int* nlongintY_G, int* nY_G, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y,
	const unsigned char* wordbits, int wordBitCount, vector<int>&interactionPairs, vector<double>&interactionMeasure, double thresholdRecord)
{
	printf("\nStarting interaction calculation ...\n");
	//cudaEvent_t evStart, evStop;
	/*cudaEventCreate(&evStart);
	cudaEventCreate(&evStop);

	cudaEventRecord(evStart, 0);*/

	uint64** gpu_genoY_G;
	int* gpu_pMarginalDistrSNP;
	int* gpu_pMarginalDistrSNP_Y;
	int* gpu_inputOffsetJ1;
	int* gpu_inputOffsetJ2;
	unsigned char* gpu_wordbits;
	void* gpu_genoY_G_Array = 0;
	
	cudaMalloc((void**)&gpu_wordbits, sizeof(unsigned char)*wordBitCount);
	cudaMemcpy(gpu_wordbits, wordbits, sizeof(unsigned char)*wordBitCount, cudaMemcpyHostToDevice);
	cudaBindTexture(0, wordbits_Texture, gpu_wordbits, sizeof(unsigned char)*wordBitCount);

	
	fflush(stdout);
	int snp1 = 0, snp2 = snp1 + 1;
	bool firstLoop = true;
	int shiftOffset = 0;
	long long totalTasks = (long long)nsnps*(nsnps - 1) / 2;
	//	long long offset = 0;

	int threadNum = THREAD_NUM;
	int blockNum = BLOCK_NUM;
	int totalNumberOfThreadBlocks = iDivUp(totalTasks, (long long)threadNum);
	int totalNumberOfGridBlock = iDivUp(totalNumberOfThreadBlocks, (long long)blockNum);
	int numOfTable = pow(2, numOfCov + 1);


	int* interactionInputOffsetJ1;
	int* interactionInputOffsetJ2;

	int* gpu_InteractionPairOffsetJ1;
	int* gpu_InteractionPairOffsetJ2;
	int* gpu_nlongintY_G;
	float* gpu_interactionPairMeasure;
	dim3 threads(threadNum, 1, 1);
	dim3 grids(blockNum, 1, 1);

	//float* gpu_floatArray;

	//host memory allocation
	int* interactionPairOffsetJ1 = (int*)calloc(threadNum*blockNum, sizeof(int));
	int* interactionPairOffsetJ2 = (int*)calloc(threadNum*blockNum, sizeof(int));
	float* interactionPairMeasure = (float*)calloc(threadNum*blockNum, sizeof(float));

	cudaHostAlloc((void**)&interactionInputOffsetJ1, sizeof(int)*blockNum*threadNum, cudaHostAllocMapped);
	cudaHostAlloc((void**)&interactionInputOffsetJ2, sizeof(int)*blockNum*threadNum, cudaHostAllocMapped);

	//pass back the device pointer and map with host
	cudaHostGetDevicePointer((void**)&gpu_inputOffsetJ1, (void*)interactionInputOffsetJ1, 0);
	cudaHostGetDevicePointer((void**)&gpu_inputOffsetJ2, (void*)interactionInputOffsetJ2, 0);
	checkCUDAError("Kernel Error!");
	//allocate GPU memmory
	//cudaMalloc((void**)&gpu_genoY_G_Array, sizeof(uint64*)*numOfTable);
	gpu_genoY_G = (uint64**)calloc(numOfTable, sizeof(uint64*));
	//cudaMemcpy(gpu_genoY_G, gpu_genoY_G_Array, sizeof(uint64*)*numOfTable, cudaMemcpyDeviceToHost);
	for (int i = 0; i < numOfTable; i++)
	{
		cudaMalloc((void**)&(gpu_genoY_G[i]), sizeof(uint64) * 3 * nsnps*nlongintY_G[i]);
	}
	checkCUDAError("Kernel Error!");
	cudaMalloc((void**)&gpu_pMarginalDistrSNP, sizeof(int)*numOfTable*nsnps);
	cudaMalloc((void**)&gpu_pMarginalDistrSNP_Y, sizeof(int)*numOfTable*NumOfGenotype*nsnps);
	checkCUDAError("Kernel Error!");
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ1, sizeof(int)*blockNum*threadNum);
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ2, sizeof(int)*blockNum*threadNum);

	cudaMalloc((void**)&gpu_nlongintY_G, sizeof(int)*numOfTable);
	cudaMalloc((void**)&gpu_interactionPairMeasure, sizeof(float)*blockNum*threadNum);

	//copy data and bind as texture
	for (int i = 0; i<numOfTable; i++)
	{
		cudaMemcpy(gpu_genoY_G[i], genoY_G[i], sizeof(uint64)*nlongintY_G[i] * NumOfGenotype*nsnps, cudaMemcpyHostToDevice);
	}
	
	cudaMemcpy(gpu_pMarginalDistrSNP, pMarginalDistrSNP, sizeof(int)*NumOfGenotype*nsnps, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_pMarginalDistrSNP_Y, pMarginalDistrSNP_Y, sizeof(int)*NumOfGenotype*numOfTable*nsnps, cudaMemcpyHostToDevice);

	cudaMemcpy(gpu_nlongintY_G, nlongintY_G, sizeof(int)*numOfTable, cudaMemcpyHostToDevice);

	

	cudaMalloc(&gpu_genoY_G_Array, sizeof(uint64*)*numOfTable);
	cudaMemcpy(gpu_genoY_G_Array, gpu_genoY_G, sizeof(uint64*)*numOfTable, cudaMemcpyHostToDevice);
	
	cudaBindTexture(0, geno_00_Texture, gpu_genoY_G[0], sizeof(uint64)*nlongintY_G[0] * 3 * nsnps);
	cudaBindTexture(0, geno_01_Texture, gpu_genoY_G[1], sizeof(uint64)*nlongintY_G[1] * 3 * nsnps);
	cudaBindTexture(0, geno_10_Texture, gpu_genoY_G[2], sizeof(uint64)*nlongintY_G[2] * 3 * nsnps);
	cudaBindTexture(0, geno_11_Texture, gpu_genoY_G[3], sizeof(uint64)*nlongintY_G[3] * 3 * nsnps);


	for (int i = 1, offset = 0; i <= totalNumberOfGridBlock; i++, offset = offset + blockNum*threadNum)
	{
		if (i % 100 == 0)
		{
			printf("\rProgress:%d%%", (int)floor(((float)i / totalNumberOfGridBlock) * 100));
			fflush(stdout);
		}

		//generate index for computation
		for (; snp1 < nsnps - 1; snp1++)
		{
			if (firstLoop)
			{
				firstLoop = false;
			}
			else
			{
				snp2 = snp1 + 1;
			}
			for (; snp2 < nsnps; snp2++)
			{
				interactionInputOffsetJ1[shiftOffset] = snp1;
				interactionInputOffsetJ2[shiftOffset] = snp2;
				shiftOffset++;
				if (shiftOffset == blockNum*threadNum){
					snp2++;
					break;
				}
			}

			if (shiftOffset == blockNum*threadNum){
				break;
			}
		}

		cudaMemset(gpu_InteractionPairOffsetJ1, 0, sizeof(int)*blockNum*threadNum);
		cudaMemset(gpu_InteractionPairOffsetJ2, 0, sizeof(int)*blockNum*threadNum);
		cudaMemset(gpu_interactionPairMeasure, 0, sizeof(float)*blockNum*threadNum);
		
		firstLoop = true;
		shiftOffset = 0;
 		//test_pairwiseTest_Kernel(genoY_G, nsnps, nlongintY_G, interactionInputOffsetJ1, interactionInputOffsetJ2, pMarginalDistrSNP, pMarginalDistrSNP_Y,
		//	interactionPairOffsetJ1, interactionPairOffsetJ2, interactionPairMeasure, thresholdRecord);
		
		pairwiseTest_Kernel << <grids, threads >> >((uint64**)gpu_genoY_G_Array, nsnps, gpu_nlongintY_G, gpu_inputOffsetJ1, gpu_inputOffsetJ2, gpu_pMarginalDistrSNP, gpu_pMarginalDistrSNP_Y,
				gpu_InteractionPairOffsetJ1, gpu_InteractionPairOffsetJ2, gpu_interactionPairMeasure, thresholdRecord,numOfCov);
		
		//pairwiseTest()
		checkCUDAError("Kernel Error!");
		if (i == totalNumberOfGridBlock)
		{
			
			cudaMemcpy(interactionPairOffsetJ1, gpu_InteractionPairOffsetJ1, sizeof(int)*blockNum*threadNum, cudaMemcpyDeviceToHost);
			checkCUDAError("Kernel Error!");
			cudaMemcpy(interactionPairOffsetJ2, gpu_InteractionPairOffsetJ2, sizeof(int)*blockNum*threadNum, cudaMemcpyDeviceToHost);
			cudaMemcpy(interactionPairMeasure, gpu_interactionPairMeasure, sizeof(float)*blockNum*threadNum, cudaMemcpyDeviceToHost);
			
			for (int j = 0; j < totalTasks % (blockNum*threadNum); j++)
			{
				if (interactionPairOffsetJ2[j] != 0)
				{
					interactionPairs.push_back(interactionPairOffsetJ1[j]);
					interactionPairs.push_back(interactionPairOffsetJ2[j]);
					interactionMeasure.push_back(interactionPairMeasure[j]);
				}
			}
		}
		else
		{
			cudaMemcpy(interactionPairOffsetJ1, gpu_InteractionPairOffsetJ1, sizeof(int)*blockNum*threadNum, cudaMemcpyDeviceToHost);

			cudaMemcpy(interactionPairOffsetJ2, gpu_InteractionPairOffsetJ2, sizeof(int)*blockNum*threadNum, cudaMemcpyDeviceToHost);
			cudaMemcpy(interactionPairMeasure, gpu_interactionPairMeasure, sizeof(float)*blockNum*threadNum, cudaMemcpyDeviceToHost);
			checkCUDAError("Kernel Error!");
			for (int j = 0; j < blockNum*threadNum; j++)
			{
				if (interactionPairOffsetJ2[j] != 0)
				{
					interactionPairs.push_back(interactionPairOffsetJ1[j]);
					interactionPairs.push_back(interactionPairOffsetJ2[j]);
					interactionMeasure.push_back(interactionPairMeasure[j]);
				}
			}
		}
		checkCUDAError("Kernel Error!");
	}
	printf("\rProgress:%d%\n", 100);

	
	cudaUnbindTexture(geno_00_Texture);
	cudaUnbindTexture(geno_01_Texture);
	cudaUnbindTexture(geno_10_Texture);
	cudaUnbindTexture(geno_11_Texture);
	cudaUnbindTexture(wordbits_Texture);

	cudaFree(gpu_wordbits);

	for (int i = 0; i < numOfTable; i++)
	{
		cudaFree(gpu_genoY_G[i]);
	}
	cudaFree(gpu_genoY_G);
	cudaFree(gpu_pMarginalDistrSNP);
	cudaFree(gpu_pMarginalDistrSNP_Y);
	cudaFree(gpu_InteractionPairOffsetJ1);
	cudaFree(gpu_InteractionPairOffsetJ2);
	cudaFree(gpu_nlongintY_G);
	cudaFree(gpu_interactionPairMeasure);

	cudaFreeHost(interactionInputOffsetJ1);
	cudaFreeHost(interactionInputOffsetJ2);

	//cudaEventRecord(evStop, 0);
	//cudaEventSynchronize(evStop);

	/*cudaEventElapsedTime(&timeInMs, evStart, evStop);

	printf("GPU Time = %fms\n",timeInMs);

	cudaEventDestroy(evStart);
	cudaEventDestroy(evStop);*/

	free(interactionPairOffsetJ1);
	free(interactionPairOffsetJ2);
	free(interactionPairMeasure);
}


